We are collecting a dataset on water quality to train a machine learning model for binary classification: determining whether water is safe for consumption (1) or not (0). This model will help with water treatment decisions and ensure compliance with quality standards. We applied different summarization and plotting methods to help us understand our dataset, such as scatter plots, histograms and bar plots. Then, we preprocessed our data using data cleaning, data transformation and feature selection.
#library:
#install.packages("caret")
#install.packages("glmnet")
#install.packages("Boruta")
#install.packages("mlbench")
#install.packages("randomForest")
library(outliers)
library(dplyr)
library(mlbench)
library(caret)
library(glmnet)
library(Boruta)
library(ggplot2)
library(randomForest)
# Show the current working directory; the dataset path below is relative to it.
getwd()
#setwd("/Users/mahayie/Desktop/326p")
#getwd()
# Read the raw table, open it in the data viewer, then print its structure
# and per-column summary statistics.
water_potability <- read.csv('Dataset/water_potability.csv')
View(water_potability)
str(water_potability)
summary(water_potability)
Checking for missing values:
dim(water_potability)
[1] 3276 10
sum(is.na(water_potability))
[1] 1434
Remove rows with missing values
water_potability = na.omit(water_potability)
View(water_potability)
Description: The absence of data in certain variables or columns in a dataset is referred to as missing or null values due to various reasons. It can have a negative impact on the dataset’s efficiency and the information that can be taken from it later, so we checked to see whether our data had missing or null values and eliminated these rows to produce a more efficient dataset.
Standard deviation:
sd(water_potability$Turbidity)
[1] 0.7803462
sd(water_potability$Solids)
[1] 8642.24
sd(water_potability$Conductivity)
[1] 80.71257
sd(water_potability$Organic_carbon)
[1] 3.324959
sd(water_potability$ph)
[1] 1.573337
Mean:
mean(water_potability$Turbidity)
[1] 3.969729
mean(water_potability$Solids)
[1] 21917.44
mean(water_potability$Conductivity)
[1] 426.5264
mean(water_potability$Organic_carbon)
[1] 14.35771
mean(water_potability$ph)
[1] 7.08599
Median
median(water_potability$Turbidity)
[1] 3.968177
median(water_potability$Solids)
[1] 20933.51
median(water_potability$Conductivity)
[1] 423.4559
median(water_potability$Organic_carbon)
[1] 14.32202
median(water_potability$ph)
[1] 7.027297
Variance
var(water_potability$Turbidity)
[1] 0.6089401
var(water_potability$Solids)
[1] 74688309
var(water_potability$Conductivity)
[1] 6514.519
var(water_potability$Organic_carbon)
[1] 11.05535
var(water_potability$ph)
[1] 2.475388
Statistical Measures:
summary(water_potability$Conductivity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
201.6 366.7 423.5 426.5 482.4 753.3
summary(water_potability$Organic_carbon)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.20 12.12 14.32 14.36 16.68 27.01
summary(water_potability$Hardness)
Min. 1st Qu. Median Mean 3rd Qu. Max.
73.49 176.74 197.19 195.97 216.44 317.34
summary(water_potability$Solids)
Min. 1st Qu. Median Mean 3rd Qu. Max.
320.9 15615.7 20933.5 21917.4 27182.6 56488.7
summary(water_potability$Chloramines)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.391 6.139 7.144 7.134 8.110 13.127
summary(water_potability$Potability)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.4033 1.0000 1.0000
summary(water_potability$Sulfate)
Min. 1st Qu. Median Mean 3rd Qu. Max.
129.0 307.6 332.2 333.2 359.3 481.0
summary(water_potability$Trihalomethanes)
Min. 1st Qu. Median Mean 3rd Qu. Max.
8.577 55.953 66.542 66.401 77.292 124.000
summary(water_potability$Turbidity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.450 3.443 3.968 3.970 4.514 6.495
summary(water_potability$ph)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.2275 6.0897 7.0273 7.0860 8.0530 14.0000
Description: Using the minimum, maximum, mean and median helps to provide an overview of the data’s key characteristics.
Data before removing outliers:
dim(water_potability)
[1] 2011 10
head(water_potability)
removing outliers:
summary(water_potability$ph)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.2275 6.0897 7.0273 7.0860 8.0530 14.0000
quartiles <- quantile(water_potability$ph, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
6.089723 8.052969
iqr <- IQR(water_potability$ph)
iqr
[1] 1.963245
lower <- quartiles[1] - 1.5*iqr
lower
25%
3.144855
upper <- quartiles[2] + 1.5*iqr
upper
75%
10.99784
boxplot(ph ~ Potability, data = water_potability)
# Repeatedly drop the rows whose ph value appears in the `$out` component of
# boxplot() (the plot is redrawn on every pass) until a pass reports none.
repeat {
  out_val <- boxplot(water_potability$ph, ylab = 'ph')$out
  out_rows <- which(water_potability$ph %in% out_val)
  if (length(out_rows) == 0) {
    break
  }
  water_potability <- water_potability[-out_rows, ]
}
summary(water_potability$ph)
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.231 6.105 7.027 7.087 8.030 10.905
#-------------------------------------------
-Hardness
summary(water_potability$Hardness)
Min. 1st Qu. Median Mean 3rd Qu. Max.
73.49 176.90 197.36 196.27 216.44 317.34
quartiles <- quantile(water_potability$Hardness, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
176.9031 216.4411
iqr <- IQR(water_potability$Hardness)
iqr
[1] 39.53799
lower <- quartiles[1] - 1.5*iqr
lower
25%
117.5961
upper <- quartiles[2] + 1.5*iqr
upper
75%
275.7481
boxplot(Hardness ~ Potability, data = water_potability)
# Repeatedly drop the rows whose Hardness value appears in the `$out` component
# of boxplot() (the plot is redrawn on every pass) until a pass reports none.
repeat {
  out_val <- boxplot(water_potability$Hardness, ylab = 'Hardness')$out
  out_rows <- which(water_potability$Hardness %in% out_val)
  if (length(out_rows) == 0) {
    break
  }
  water_potability <- water_potability[-out_rows, ]
}
summary(water_potability$Hardness)
Min. 1st Qu. Median Mean 3rd Qu. Max.
121.0 177.7 197.3 196.2 215.5 272.1
#-------------------------------------------
-Solids
summary(water_potability$Solids)
Min. 1st Qu. Median Mean 3rd Qu. Max.
320.9 15704.5 20855.3 21840.2 27045.9 56488.7
quartiles <- quantile(water_potability$Solids, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
15704.48 27045.93
iqr <- IQR(water_potability$Solids)
iqr
[1] 11341.45
lower <- quartiles[1] - 1.5*iqr
lower
25%
-1307.69
upper <- quartiles[2] + 1.5*iqr
upper
75%
44058.1
boxplot(Solids ~ Potability, data = water_potability)
# Repeatedly drop the rows whose Solids value appears in the `$out` component
# of boxplot() (the plot is redrawn on every pass) until a pass reports none.
repeat {
  out_val <- boxplot(water_potability$Solids, ylab = 'Solids')$out
  out_rows <- which(water_potability$Solids %in% out_val)
  if (length(out_rows) == 0) {
    break
  }
  water_potability <- water_potability[-out_rows, ]
}
summary(water_potability$Solids)
Min. 1st Qu. Median Mean 3rd Qu. Max.
320.9 15547.5 20518.7 21419.6 26734.7 43195.5
#-------------------------------------------
-Chloramines
summary(water_potability$Chloramines)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.391 6.141 7.135 7.135 8.094 13.127
quartiles <- quantile(water_potability$Chloramines, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
6.141236 8.094323
iqr <- IQR(water_potability$Chloramines)
iqr
[1] 1.953087
lower <- quartiles[1] - 1.5*iqr
lower
25%
3.211605
upper <- quartiles[2] + 1.5*iqr
upper
75%
11.02395
boxplot(Chloramines ~ Potability, data = water_potability)
# Repeatedly drop the rows whose Chloramines value appears in the `$out`
# component of boxplot() (the plot is redrawn on every pass) until a pass
# reports none.
repeat {
  out_val <- boxplot(water_potability$Chloramines, ylab = 'Chloramines')$out
  out_rows <- which(water_potability$Chloramines %in% out_val)
  if (length(out_rows) == 0) {
    break
  }
  water_potability <- water_potability[-out_rows, ]
}
summary(water_potability$Chloramines)
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.352 6.181 7.137 7.136 8.076 10.897
#-------------------------------------------
-Sulfate
summary(water_potability$Sulfate)
Min. 1st Qu. Median Mean 3rd Qu. Max.
187.2 308.2 332.6 333.4 358.3 481.0
quartiles <- quantile(water_potability$Sulfate, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
308.1884 358.3020
iqr <- IQR(water_potability$Sulfate)
iqr
[1] 50.11358
lower <- quartiles[1] - 1.5*iqr
lower
25%
233.0181
upper <- quartiles[2] + 1.5*iqr
upper
75%
433.4724
boxplot(Sulfate ~ Potability, data = water_potability)
# Repeatedly drop the rows whose Sulfate value appears in the `$out` component
# of boxplot() (the plot is redrawn on every pass) until a pass reports none.
repeat {
  out_val <- boxplot(water_potability$Sulfate, ylab = 'Sulfate')$out
  out_rows <- which(water_potability$Sulfate %in% out_val)
  if (length(out_rows) == 0) {
    break
  }
  water_potability <- water_potability[-out_rows, ]
}
summary(water_potability$Sulfate)
Min. 1st Qu. Median Mean 3rd Qu. Max.
237.5 309.2 332.8 333.6 357.7 429.8
#-------------------------------------------
-Conductivity
summary(water_potability$Conductivity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
201.6 366.6 423.6 426.8 482.6 753.3
quartiles <- quantile(water_potability$Conductivity, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
366.5581 482.5983
iqr <- IQR(water_potability$Conductivity)
iqr
[1] 116.0401
lower <- quartiles[1] - 1.5*iqr
lower
25%
192.4979
upper <- quartiles[2] + 1.5*iqr
upper
75%
656.6585
boxplot(Conductivity ~ Potability, data = water_potability)
# Repeatedly drop the rows whose Conductivity value appears in the `$out`
# component of boxplot() (the plot is redrawn on every pass) until a pass
# reports none.
repeat {
  out_val <- boxplot(water_potability$Conductivity, ylab = 'Conductivity')$out
  out_rows <- which(water_potability$Conductivity %in% out_val)
  if (length(out_rows) == 0) {
    break
  }
  water_potability <- water_potability[-out_rows, ]
}
summary(water_potability$Conductivity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
201.6 366.4 423.1 426.0 481.9 652.5
#-------------------------------------------
-Organic_carbon
summary(water_potability$Organic_carbon)
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.372 12.184 14.351 14.417 16.788 27.007
quartiles <- quantile(water_potability$Organic_carbon, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
12.18447 16.78779
iqr <- IQR(water_potability$Organic_carbon)
iqr
[1] 4.603315
lower <- quartiles[1] - 1.5*iqr
lower
25%
5.279502
upper <- quartiles[2] + 1.5*iqr
upper
75%
23.69276
boxplot(Organic_carbon ~ Potability, data = water_potability)
# Repeatedly drop the rows whose Organic_carbon value appears in the `$out`
# component of boxplot() (the plot is redrawn on every pass) until a pass
# reports none.
repeat {
  out_val <- boxplot(water_potability$Organic_carbon, ylab = 'Organic_carbon')$out
  out_rows <- which(water_potability$Organic_carbon %in% out_val)
  if (length(out_rows) == 0) {
    break
  }
  water_potability <- water_potability[-out_rows, ]
}
summary(water_potability$Organic_carbon)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.512 12.222 14.352 14.426 16.786 23.604
#-------------------------------------------
-Trihalomethanes
summary(water_potability$Trihalomethanes)
Min. 1st Qu. Median Mean 3rd Qu. Max.
8.577 55.865 66.231 66.364 77.418 124.000
quartiles <- quantile(water_potability$Trihalomethanes, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
55.86494 77.41789
iqr <- IQR(water_potability$Trihalomethanes)
iqr
[1] 21.55295
lower <- quartiles[1] - 1.5*iqr
lower
25%
23.53552
upper <- quartiles[2] + 1.5*iqr
upper
75%
109.7473
boxplot(Trihalomethanes ~ Potability, data = water_potability)
# Repeatedly drop the rows whose Trihalomethanes value appears in the `$out`
# component of boxplot() (the plot is redrawn on every pass) until a pass
# reports none.
repeat {
  out_val <- boxplot(water_potability$Trihalomethanes, ylab = 'Trihalomethanes')$out
  out_rows <- which(water_potability$Trihalomethanes %in% out_val)
  if (length(out_rows) == 0) {
    break
  }
  water_potability <- water_potability[-out_rows, ]
}
summary(water_potability$Trihalomethanes)
Min. 1st Qu. Median Mean 3rd Qu. Max.
24.53 55.96 66.29 66.42 77.34 108.85
#-------------------------------------------
-Turbidity
summary(water_potability$Turbidity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.450 3.441 3.975 3.973 4.519 6.495
quartiles <- quantile(water_potability$Turbidity, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
3.440859 4.518751
iqr <- IQR(water_potability$Turbidity)
iqr
[1] 1.077892
lower <- quartiles[1] - 1.5*iqr
lower
25%
1.824021
upper <- quartiles[2] + 1.5*iqr
upper
75%
6.135588
boxplot(Turbidity ~ Potability, data = water_potability)
# Repeatedly drop the rows whose Turbidity value appears in the `$out`
# component of boxplot() (the plot is redrawn on every pass) until a pass
# reports none.
repeat {
  out_val <- boxplot(water_potability$Turbidity, ylab = 'Turbidity')$out
  out_rows <- which(water_potability$Turbidity %in% out_val)
  if (length(out_rows) == 0) {
    break
  }
  water_potability <- water_potability[-out_rows, ]
}
summary(water_potability$Turbidity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.873 3.443 3.974 3.972 4.512 6.084
After removing outliers:
dim(water_potability)
[1] 1750 10
str(water_potability)
'data.frame': 1750 obs. of 10 variables:
$ ph : num 8.32 9.09 5.58 10.22 8.64 ...
$ Hardness : num 214 181 188 248 203 ...
$ Solids : num 22018 17979 28749 28750 13672 ...
$ Chloramines : num 8.06 6.55 7.54 7.51 4.56 ...
$ Sulfate : num 357 310 327 394 303 ...
$ Conductivity : num 363 398 280 284 475 ...
$ Organic_carbon : num 18.4 11.6 8.4 13.8 12.4 ...
$ Trihalomethanes: num 100.3 32 54.9 84.6 62.8 ...
$ Turbidity : num 4.63 4.08 2.56 2.67 4.4 ...
$ Potability : int 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "na.action")= 'omit' Named int [1:1265] 1 2 3 9 12 14 15 17 19 21 ...
..- attr(*, "names")= chr [1:1265] "1" "2" "3" "9" ...
head(water_potability)
Description: Removing outliers from a dataset is critical for assuring the quality and reliability of statistical analysis and machine learning models. We found all outliers in the numerical attributes and subsequently eliminated the rows containing the outliers.
Charts
Histogram
# Draw one histogram per numeric attribute, in the same order as before.
# The title and x-axis label are spelled out so they read exactly as hist()
# would label a `water_potability$<column>` argument.
for (attribute in c("ph", "Chloramines", "Hardness", "Solids", "Sulfate",
                    "Conductivity", "Organic_carbon", "Trihalomethanes",
                    "Turbidity")) {
  column_label <- paste0("water_potability$", attribute)
  hist(water_potability[[attribute]],
       main = paste("Histogram of", column_label),
       xlab = column_label)
}
Bar Plot
# Count the rows per Potability class and keep the counts as text labels.
tab <- water_potability$Potability %>% table()
txt <- paste0(tab)
# Bar chart with one bar per distinct ph value (height = number of occurrences).
bb <- water_potability$ph %>% table() %>% barplot( main='ph',col=c('pink'))
# Bar chart of the Potability class counts; `bb` is overwritten with the value
# returned by this second barplot() call, which text() uses as x positions.
bb <- water_potability$Potability %>% table() %>% barplot( main='Potability',ylab='Frequency',col=c('pink', 'lightblue'))
# Write each class count at half the height of its bar.
text(bb, tab/2, labels=txt, cex=1)
Pie chart
water_potability$Potability %>% table() %>% pie()
Scatter Plot
# Scatter plot of Turbidity against ph, one point per sample, with the
# Potability class shown by colour and plotting symbol.
# Potability is coded 0/1 and colour index 0 is the background colour, so
# using it directly as `col` makes every class-0 point invisible. Shifting the
# index by one gives class 0 colour 1 and class 1 colour 2. Symbol 0 (open
# square) is a visible symbol, so `pch` is left unchanged.
with(water_potability, plot(Turbidity, ph, col = as.numeric(Potability) + 1, pch = as.numeric(Potability)))
Description: -Histogram: The histogram shows the frequency of ph values in the dataset; we noted that the majority of values fall within the usual range, roughly between 6 and 8, but it also shows several outliers. -Scatter plot: This scatter plot shows the relationship between the two attributes, allowing us to establish whether or not turbidity and pH are correlated. -Bar plot: The bar plot represents how ph levels affect water potability in the dataset; it indicates that water with a ph level above 10 is not potable and humans cannot consume it.
Remove Redundant Features:
# Pairwise correlations between the nine predictor columns (columns 1 to 9).
correlation_matrix <- cor(x = water_potability[, 1:9])
# Ask caret which columns it would flag at a correlation cutoff of 0.5,
# then show the flagged column indices.
high_correlation_features <- findCorrelation(
  x = correlation_matrix,
  cutoff = 0.5
)
print(high_correlation_features)
integer(0)
heatmap(correlation_matrix)
Description: This finds the correlation between the features and represents it in a heat map.
Feature selection
Rank Features By Importance:
# Train a random forest on the nine predictors (columns 1 to 9) and rank them
# by importance.
# Potability (column 10) is stored as 0/1 integers. A numeric response makes
# randomForest fit a regression, so it is converted to a factor to train the
# binary classifier this analysis is about.
rf <- randomForest(x = water_potability[, 1:9],
                   y = as.factor(water_potability[, 10]))
var_imp <- varImp(rf, scale = FALSE)
# Collect variable names and scores, then show them in decreasing order.
var_imp_df <- data.frame(variable = rownames(var_imp), score = var_imp[, 1])
var_imp_df[order(var_imp_df$score, decreasing = TRUE), ]
# Lollipop chart of the scores, most important variable at the top.
# For a classification forest fitted without `importance = TRUE`, the score is
# the mean decrease in Gini impurity, hence the axis label.
ggplot(var_imp_df, aes(x = reorder(variable, score), y = score)) +
  geom_point() +
  geom_segment(aes(x = variable, xend = variable, y = 0, yend = score)) +
  ylab("MeanDecreaseGini") +
  xlab("Variable Name") +
  coord_flip()
Recursive Feature elimination:
# Recursive feature elimination using caret's random-forest helper functions,
# resampled with 10-fold cross-validation.
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
# NOTE(review): this reuses the name `rf`, replacing the forest fitted in the
# importance-ranking step, and is not used by the rfe() call below. It is kept
# in case later code relies on it; confirm and rename or remove.
rf <- trainControl(method = "cv", number = 10, verboseIter = FALSE)
# run the RFE algorithm
# The 0/1 target (column 10) is converted to a factor so that the candidate
# subsets are evaluated as a classification problem rather than a regression.
rfe_model <- rfe(x = water_potability[, 1:9],
                 y = as.factor(water_potability[, 10]),
                 sizes = 1:9,
                 rfeControl = control)
# summarize the results
print(rfe_model)
Recursive feature selection
Outer resampling method: Cross-Validated (10 fold)
Resampling performance over subset size:
The top 5 variables (out of 5):
Sulfate, ph, Hardness, Solids, Chloramines
# list the chosen features
predictors(rfe_model)
[1] "Sulfate" "ph" "Hardness" "Solids" "Chloramines"
# plot the results
plot(rfe_model, type=c("g", "o"))
Description: Ranking features by importance is a technique used to identify the most influential variables in a dataset for predicting a target variable. This process helps in understanding which features have the most impact on the model’s performance. By ranking features by importance, we can see which attributes contribute most to the prediction.
Removing redundant features refers to the process of eliminating variables or features from a dataset that do not provide any additional or unique information.
Data transformation
Normalization
# Min-max normalization: linearly rescale a numeric vector onto [0, 1].
#
# x: numeric vector.
# Returns a vector of the same length as x in which min(x) maps to 0 and
# max(x) maps to 1. The divisor is the range (max - min); dividing by max(x)
# alone leaves the largest value below 1 whenever min(x) is not 0.
# A vector whose values are all equal maps to all zeros instead of 0/0 = NaN.
# Missing values are not removed: if x contains NA the result is all NA.
normalize <- function(x) {
  lowest <- min(x)
  span <- max(x) - lowest
  if (isTRUE(span == 0)) {
    return(rep(0, length(x)))
  }
  (x - lowest) / span
}
# Keep a copy of the table as it is before any rescaling.
w <- water_potability
# Rescale each of these attributes in place with normalize().
for (attribute in c("Sulfate", "Conductivity", "Turbidity", "ph",
                    "Chloramines", "Solids", "Trihalomethanes",
                    "Organic_carbon")) {
  water_potability[[attribute]] <- normalize(water_potability[[attribute]])
}
# Hardness is divided by 1000 first and then rescaled like the others.
water_potability$Hardness <- normalize(water_potability$Hardness / 1000)
print(water_potability)
Description: Normalization refers to the process of scaling variables to have a common range. It helps in comparing variables with different scales. The Solids attribute would create critical challenges because of its huge and widely spread values (min = 320.9, max = 43195.5), so we normalized Solids to make the values smaller and more reasonable. We also normalized the other numeric attributes: Sulfate, Conductivity, Organic_carbon, Trihalomethanes, Turbidity, ph, Chloramines and Hardness.
Discretization:
# Bin three continuous attributes of the copy `w` into equal-width intervals.
# `right = FALSE` makes every interval closed on the left and open on the
# right, i.e. [a, b).
w$Trihalomethanes <- cut(
  w$Trihalomethanes,
  breaks = seq(from = 0, to = 125, by = 25),
  right = FALSE
)
w$Solids <- cut(
  w$Solids,
  breaks = seq(from = 0, to = 50000, by = 10000),
  right = FALSE
)
w$Organic_carbon <- cut(
  w$Organic_carbon,
  breaks = seq(from = 0, to = 25, by = 5),
  right = FALSE
)
print(w)
Description: Discretization is the process of transforming continuous variables into discrete or categorical variables. It can be useful for analyzing data that has a large number of unique values or when you want to simplify the data. For Trihalomethanes we created intervals of width 25 so that the labels have equal width: [0,25), [25,50), [50,75), [75,100), [100,125).
Encoding: Encoding is the process of converting characters or strings into a specific encoding format. Since we don’t have a nominal attribute in our dataset, we couldn’t apply it.